This script puts together clinical data from multiple studies in order to estimate probabilities that Kenyan children enrolled in a large observational cohort had severe malaria.
This script can only be run with access to original datasets - need to contact XXXXX.
Data used to build the reference model of severe malaria:
Load data
load('../RData/kemri_case_data.RData')
load('../RData/kemri_control_data.RData')
kemri_case_data$sickle_trait = as.numeric(kemri_case_data$sickle==2)
kemri_control_data$sickle_trait = as.numeric(kemri_control_data$sickle==2)
writeLines('Distribution of the number of WT alleles (A) at rs_334 in the severe malaria cases:')
## Distribution of the number of WT alleles (A) at rs_334 in the severe malaria cases:
table(kemri_case_data$sickle-1, useNA = 'ifany')
##
## 0 1 2 <NA>
## 2145 57 11 7
writeLines('Distribution of the number of WT alleles (A) at rs_334 in the population controls:')
## Distribution of the number of WT alleles (A) at rs_334 in the population controls:
table(kemri_control_data$sickle-1, useNA = 'ifany')
##
## 0 1 2
## 3313 594 33
Do imputation for missing variables
K_imputes = 10
dat = kemri_case_data
dat$baseexc[dat$baseexc > 30]= NA
dat$BD = -dat$baseexc
dat$log10_platelets = log10(dat$platelet)
dat$log10_wbc = log10(dat$wbc)
dat$log10_creatinine = log10(dat$creat)
dat$log10_creatinine[is.infinite(dat$log10_creatinine)] = NA
dat = dat[, -which(colnames(dat)=='baseexc')]
vars_interest = colnames(dat)
writeLines('All available variables:')
## All available variables:
print(colnames(dat))
## [1] "sample_code" "pat_id" "source_code"
## [4] "ethnic" "agemths" "ageyr"
## [7] "sex" "died" "muac"
## [10] "height" "weight" "bpd"
## [13] "bps" "pul_hrate" "crefill"
## [16] "tempaxil" "oxysat" "prostration"
## [19] "severe_malaria_anaemia" "cerebral_malaria" "respiratory_distress"
## [22] "parasite_gn" "hb" "rbc"
## [25] "hct" "mcv" "platelet"
## [28] "na" "k" "creat"
## [31] "glucose" "hco3" "g6pd_202"
## [34] "g6pd_376" "thal" "sickle"
## [37] "wbc" "bcstot" "resp_rate"
## [40] "bcs_verbal" "bcs_motor" "bcs_eyes"
## [43] "convulsions" "bcs_total" "IID"
## [46] "bloodculture_pos" "bacteria" "case_control"
## [49] "log_parasites" "sickle_trait" "BD"
## [52] "log10_platelets" "log10_wbc" "log10_creatinine"
vars_interest = vars_interest[!vars_interest%in%
c('ageyr','IID','case_control','thal',
'bacteria',"sample_code",
'sickle','g6pd_202','g6pd_376',
"pat_id","source_code","parasite_gn",
'bcs_total', 'bcs_eyes','bcs_motor',
'bcs_verbal', 'platelet', 'wbc','creat')]
Missing_data = is.na(dat[,vars_interest])
par(las=1, mar=c(4,9,2,2))
image(x = 1:nrow(dat), y = 1:length(vars_interest),
Missing_data, yaxt='n',ylab='',xaxt='n', xlab='')
axis(2, at = 1:length(vars_interest),
labels = tolower(vars_interest),cex=.7)
writeLines(sprintf('There is clinical data on %s individuals', nrow(dat)))
## There is clinical data on 2220 individuals
apply(kemri_case_data, 2, function(x) mean(is.na(x)))
## sample_code pat_id source_code
## 0.000000000 0.000000000 0.000000000
## ethnic agemths ageyr
## 0.000000000 0.001351351 0.001351351
## sex died muac
## 0.000000000 0.005855856 0.274324324
## height weight bpd
## 0.259909910 0.013963964 0.530180180
## bps pul_hrate crefill
## 0.501801802 0.225225225 0.164864865
## tempaxil oxysat prostration
## 0.050450450 0.155855856 0.019819820
## severe_malaria_anaemia cerebral_malaria respiratory_distress
## 0.000000000 0.070270270 0.029729730
## parasite_gn hb rbc
## 0.016216216 0.000000000 0.044594595
## hct mcv platelet
## 0.050000000 0.046396396 0.177927928
## na k creat
## 0.150000000 0.150450450 0.232432432
## glucose hco3 baseexc
## 0.268468468 0.245945946 0.222972973
## g6pd_202 g6pd_376 thal
## 0.000000000 0.005405405 0.045495495
## sickle wbc bcstot
## 0.003153153 0.002252252 0.070270270
## resp_rate bcs_verbal bcs_motor
## 0.180630631 0.377027027 0.377027027
## bcs_eyes convulsions bcs_total
## 0.377027027 0.558108108 0.377927928
## IID bloodculture_pos bacteria
## 0.025225225 0.000000000 0.964864865
## case_control log_parasites sickle_trait
## 0.000000000 0.016216216 0.003153153
# redo this with platelets and wbc on log scale
if(RE_RUN_IMPUTATION){
set.seed(8758768)
registerDoParallel(cores = 6)
imputed_data_KEMRI = list()
my_dat = dat[,vars_interest]
discrete_vars = c('sex','ethnic','died','prostration','cerebral_malaria',
'respiratory_distress','bloodculture_pos','severe_malaria_anaemia')
for(cc in discrete_vars){
my_dat[,cc] = as.factor(my_dat[,cc])
}
for(k in 1:K_imputes){
out_impute = missForest(xmis = my_dat, variablewise = T,
ntree = 200,
decreasing = T,verbose = T,
parallelize = 'forests')
out_impute$ximp$hypo_glycaemic = out_impute$ximp$glucose<2.2
imputed_data_KEMRI[[k]] = out_impute$ximp
}
save(imputed_data_KEMRI, file = '../RData/Imputed_Datasets_KEMRI.RData')
} else {
load('../RData/Imputed_Datasets_KEMRI.RData')
}
AQ study - impute missing data
# Load the AQ Vietnam data
aq_dat = readstata13::read.dta13('~/Dropbox/Datasets/AQ study/Vietnam AQ.DTA')
aav_dat = read.csv('~/Dropbox/Datasets/AllMalariaDataBackUp/AAV_Vietnam/AAV.csv')
aav_dat[aav_dat==999999] = NA
aq_dat_training = data.frame(platelets_log10 = log10(aq_dat$admplat/10^3),
wbc_log10 = log10(aq_dat$admwbc/10^3),
parasitaemia_log10 = log10(aq_dat$admpct),
hct = aq_dat$admhct,
age = aq_dat$age,
outcome = aq_dat$outcome,
gcs = aq_dat$admgcs,
creatinine_log10 = log10(aq_dat$admcrea),
sex = aq_dat$sex)
aav_dat_training = data.frame(platelets_log10 = log10(aav_dat$admplt/10^3),
wbc_log10 = log10(aav_dat$admwbc/10^3),
parasitaemia_log10 = log10(aav_dat$admparasitemia),
hct = aav_dat$initialhct,
age = aav_dat$age,
outcome = aav_dat$mortalityoutcome,
gcs = aav_dat$gcstotal,
creatinine_log10 = log10(aav_dat$admcrea),
sex = aav_dat$sex)
Vietnam_training = rbind(aq_dat_training, aav_dat_training)
apply(Vietnam_training, 2, function(x) round(100*mean(is.na(x)),1))
## platelets_log10 wbc_log10 parasitaemia_log10 hct
## 7.2 4.1 0.0 0.0
## age outcome gcs creatinine_log10
## 0.0 0.0 0.0 0.5
## sex
## 0.0
par(las=1, mar = c(2,9,2,2))
image(x = 1:nrow(Vietnam_training),
y = 1:ncol(Vietnam_training), is.na(Vietnam_training),
yaxt='n',ylab='',xaxt='n', xlab='')
axis(2, at = 1:ncol(Vietnam_training), labels = tolower(colnames(Vietnam_training)),cex=.7)
if(RE_RUN_IMPUTATION){
set.seed(8758768)
registerDoParallel(cores = 6)
imputed_data_Viet = list()
my_dat = Vietnam_training
discrete_vars = c('sex','outcome')
for(cc in discrete_vars){
my_dat[,cc] = as.factor(my_dat[,cc])
}
for(k in 1:K_imputes){
out_impute = missForest(xmis = my_dat, variablewise = T,
ntree = 200,
decreasing = T,verbose = T,
parallelize = 'forests')
imputed_data_Viet[[k]] = out_impute$ximp
}
save(imputed_data_Viet, file = '../RData/Imputed_Datasets_Vietnam.RData')
} else {
load('../RData/Imputed_Datasets_Vietnam.RData')
}
# make an imputed dataset just for platelets and white counts
imputed_Vietam_minimal = list()
for(k in 1:K_imputes){
imputed_Vietam_minimal[[k]] =
imputed_data_Viet[[k]][, c('platelets_log10','wbc_log10','age')]
}
save(imputed_Vietam_minimal, file = 'Inputs/imputed_Vietam_minimal.RData')
writeLines(sprintf('We include a total of %s adults from Vietnam',nrow(Vietnam_training)))
## We include a total of 930 adults from Vietnam
MORU core malaria (severe patients only)
# Core Malaria - MORU: children with severe malaria
# prepared using file extract_data.R in the hrp2 folder
load(file = '../RData/Core_malaria_hrp2_platelets.RData')
# filter out one super low platelet count (probably data entry mistake)
dat_core = core_data[core_data$platelets>1, ]
print(table(dat_core$country))
##
## Bangladesh Thailand
## 508 148
hist(dat_core$age, breaks = seq(0,80,by=5))
writeLines(sprintf('We include a total of %s patients from the Bangladesh and Thailand',nrow(dat_core)))
## We include a total of 656 patients from the Bangladesh and Thailand
FEAST
load(file = '../RData/feast_platelet_white_count_data.RData')
# select the patients with high HRP2 and parasite counts as training data for the reference model
ind_feast_training = !is.na(dat_feast$platelets) &
!is.na(dat_feast$white_count) &
!is.na(dat_feast$hrp2) &
!is.na(dat_feast$parasitaemia) &
dat_feast$hrp2 > 1000 & dat_feast$parasitaemia > 1000
writeLines(sprintf('The diagnostic model training data includes a total of %s patients from the FEAST study',
sum(ind_feast_training)))
## The diagnostic model training data includes a total of 121 patients from the FEAST study
ind_platelet_hrp2 = !is.na(dat_feast$platelets) &
!is.na(dat_feast$hrp2)
xx = dat_feast[ind_platelet_hrp2, c('platelets','hrp2','parasitaemia')]
write.csv(xx, file = 'Inputs/FEAST_platelets_hrp2.csv',
row.names = F)
writeLines(sprintf('We use a total of %s patients from the FEAST study to show the relationship between PfHRP2 and platelet counts',
sum(ind_platelet_hrp2)))
## We use a total of 567 patients from the FEAST study to show the relationship between PfHRP2 and platelet counts
We use the data from FEAST to estimate the age-related trend in white counts
# Age model of white counts
wbc_age_model = mgcv::gam(formula = log10_wbc ~ s(age),
data = data.frame(age=dat_feast$age,
log10_wbc=log10(dat_feast$white_count)))
summary(wbc_age_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log10_wbc ~ s(age)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.108483 0.009394 118 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 2.321 2.908 12.55 8.59e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.041 Deviance explained = 4.36%
## GCV = 0.076018 Scale est. = 0.075724 n = 858
# save model for reproducibility
save(wbc_age_model, file = 'Inputs/wbc_age_model')
par(las=1, cex.lab=1.5, cex.axis=1.5)
plot((0:120)/12, predict(wbc_age_model, newdata=data.frame(age=0:120)),
type='l',lwd=3, xlab='Age (years)', ylab ='White blood cell count (x1000 per uL)',
yaxt='n', ylim = range(log10(dat_feast$white_count),na.rm = T))
axis(2, at = seq(0,2, by=0.5), labels = round(10^seq(0,2, by=0.5),1))
points(jitter(dat_feast$age/12), log10(dat_feast$white_count), pch=20)
# estimate the expected baseline white count in severe disease in children older than 60 months
baseline_log10_wbc = mean(log10(dat_feast$white_count[dat_feast$age>60]),
na.rm = T)
dat_feast$wbc_log10 = log10(dat_feast$white_count)
# calculate an age-corrected white count for FEAST children
dat_feast$wbc_log10_corrected = dat_feast$wbc_log10
ind = dat_feast$age<60 & !is.na(dat_feast$white_count)
dat_feast$wbc_log10_corrected[ind] =
dat_feast$wbc_log10_corrected[ind]-
(predict(wbc_age_model,
newdata=data.frame(age=dat_feast$age[ind]))-baseline_log10_wbc)
dat_feast_training =
data.frame(platelets_log10 = log10(dat_feast$platelets)[which(ind_feast_training)],
wbc_log10 = dat_feast$wbc_log10[which(ind_feast_training)],
wbc_log10_corrected = dat_feast$wbc_log10_corrected[which(ind_feast_training)])
# calculate an age-corrected white count for Core Malaria children
ind = !is.na(dat_core$age)&dat_core$age<5
sum(ind)
## [1] 5
dat_core$log10_wbc = log10(dat_core$wbc)
dat_core$platelets_log10 = log10(dat_core$platelets)
dat_core$log10_wbc_corrected = log10(dat_core$wbc)
dat_core$log10_wbc_corrected[ind] = dat_core$log10_wbc[ind]-
(predict(wbc_age_model, newdata=data.frame(age=dat_core$age[ind]*12))-baseline_log10_wbc)
# calculate an age-corrected white count for Kenyan children
ind = !is.na(kemri_case_data$agemths) & kemri_case_data$agemths<60
# extract an imputed wbc set
kemri_case_data$log10_wbc_imputed = imputed_data_KEMRI[[1]]$log10_wbc
kemri_case_data$log10_platelet_imputed = imputed_data_KEMRI[[1]]$log10_platelets
kemri_case_data$log10_wbc_imputed_corrected = kemri_case_data$log10_wbc_imputed
kemri_case_data$log10_wbc_imputed_corrected[ind] = kemri_case_data$log10_wbc_imputed[ind]-
(predict(wbc_age_model, newdata=data.frame(age=kemri_case_data$agemths[ind]))-baseline_log10_wbc)
QQ plots comparing studies
par(mfrow=c(2,2), las= 1, family='serif', cex.lab=1.5, cex.axis=1.5)
qqplot(imputed_Vietam_minimal[[1]]$wbc_log10, # no crrection needed s all adults
dat_feast_training$wbc_log10_corrected,
xlim=c(0,2), ylim=c(0,2),panel.first=grid(),
xlab='White count (Vietnamese adults)',
ylab='White count (FEAST)', xaxt='n',yaxt='n',pch=16)
axis(side = 1, at = 0:2, labels = 10^(0:2))
axis(side = 1, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))),
tick = T, labels = F)
axis(side = 2, at = 0:2, labels = 10^(0:2))
axis(side = 2, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))),
tick = T, labels = F)
lines(0:5, 0:5)
qqplot(dat_core$log10_wbc_corrected,
dat_feast_training$wbc_log10_corrected,
xlab='White count (Bangladesh/Thailand)',
xlim=c(0,2), ylim=c(0,2),panel.first=grid(),
ylab='White count (FEAST)', xaxt='n',yaxt='n',pch=16)
axis(side = 1, at = 0:2, labels = 10^(0:2))
axis(side = 1, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))),
tick = T, labels = F)
axis(side = 2, at = 0:2, labels = 10^(0:2))
axis(side = 2, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))),
tick = T, labels = F)
lines(0:5, 0:5)
qqplot(c(dat_core$log10_wbc_corrected,imputed_data_Viet[[1]]$wbc_log10),
dat_feast_training$wbc_log10_corrected,
xlab='White count (Asia)',
xlim=c(0,2), ylim=c(0,2),panel.first=grid(),
ylab='White count (FEAST)', xaxt='n',yaxt='n',pch=16)
axis(side = 1, at = 0:2, labels = 10^(0:2))
axis(side = 1, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))),
tick = T, labels = F)
axis(side = 2, at = 0:2, labels = 10^(0:2))
axis(side = 2, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))),
tick = T, labels = F)
lines(0:5, 0:5)
#************* Platelets ***************
par(mfrow=c(2,2), las= 1, family='serif', cex.lab=1.5, cex.axis=1.5)
qqplot(imputed_Vietam_minimal[[1]]$platelets_log10,
dat_feast_training$platelets_log10,
xlab='Platelet count (Vietnamese adults)',
panel.first=grid(),
xlim = c(1,3), ylim=c(1,3),
ylab='Platelet count (FEAST)', pch=16, xaxt='n',yaxt='n')
axis(side = 1, at = 1:3, labels = 10^(1:3))
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))),
tick = T, labels = F)
axis(side = 2, at = 1:3, labels = 10^(1:3))
axis(side = 2, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))),
tick = T, labels = F)
lines(0:5, 0:5)
qqplot(log10(dat_core$platelets),
dat_feast_training$platelets_log10,
panel.first=grid(),
xlim = c(1,3), ylim=c(1,3),
xlab='Platelet count (Bangladesh/Thailand)',
ylab='Platelet count (FEAST)', pch=16, xaxt='n',yaxt='n')
axis(side = 1, at = 1:3, labels = 10^(1:3))
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))),
tick = T, labels = F)
axis(side = 2, at = 1:3, labels = 10^(1:3))
axis(side = 2, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))),
tick = T, labels = F)
lines(0:5, 0:5)
qqplot(c(log10(dat_core$platelets), imputed_Vietam_minimal[[1]]$platelets_log10),
dat_feast_training$platelets_log10,
xlim = c(1,3), ylim=c(1,3),
panel.first=grid(),
xlab='Platelet count (Asia)',
ylab='Platelet count (FEAST)', pch=16, xaxt='n',yaxt='n')
axis(side = 1, at = 1:3, labels = 10^(1:3))
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))),
tick = T, labels = F)
axis(side = 2, at = 1:3, labels = 10^(1:3))
axis(side = 2, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))),
tick = T, labels = F)
lines(0:5, 0:5)
QQ plots looking at marginal normality
k=1
dat_Training = rbind(imputed_Vietam_minimal[[k]][, c('platelets_log10', 'wbc_log10')],
rename(dat_feast_training[, c('platelets_log10', 'wbc_log10_corrected')],
wbc_log10=wbc_log10_corrected),
rename(dat_core[, c('platelets_log10', 'log10_wbc_corrected')],
wbc_log10=log10_wbc_corrected))
writeLines('Dimensions of training dataset:')
## Dimensions of training dataset:
print(dim(dat_Training))
## [1] 1707 2
par(las=1, bty='n', mfrow=c(1,2), family='serif',
cex.lab=1.5, cex.axis=1.5)
qqnorm(scale(dat_Training$platelets_log10), pch=20,
panel.first=grid(), main = 'Log10 platelet counts')
lines(-10:10, -10:10, lwd=2, lty=2)
qqnorm(scale(dat_Training$wbc_log10), pch=20,
panel.first=grid(), main = 'Log10 white counts')
lines(-10:10, -10:10, lwd=2, lty=2)
Make curated dataset
dat_kenya = kemri_case_data[ , c('agemths',
'platelet', # observed platelets
'wbc', # observed white counts
'log10_platelet_imputed', # imputed white counts (log10 scale)
'log10_wbc_imputed', # imputed white counts (log10 scale)
'log10_wbc_imputed_corrected')] #with age correction
dat_kenya$HbAS = kemri_case_data$sickle_trait
dim(dat_kenya)
## [1] 2220 7
dim(dat_Training)
## [1] 1707 2
dim(dat_feast_training)
## [1] 121 3
save(dat_kenya, dat_Training, dat_feast_training, dat_core,
file = 'Inputs/curated_modelling_dataset.RData')